Mu Jin
## import library
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import sqlite3
# Load in the hierarchy information
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep = "\t").drop(['Level5'], axis = 1)
multilevel_lookup = multilevel_lookup.rename(columns = {
"modify" : "roi",
"modify.1" : "level4",
"modify.2" : "level3",
"modify.3" : "level2",
"modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
multilevel_lookup.head()
# load in the subject data
id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5) & (subjectData.id == id)]
subjectData = subjectData[['roi', 'volume']]
# Merge the subject data with the multilevel data
subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
subjectData = subjectData.assign(icv = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head()
| roi | volume | level4 | level3 | level2 | level1 | icv | comp | |
|---|---|---|---|---|---|---|---|---|
| 0 | SFG_L | 12926 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009350 |
| 1 | SFG_R | 10050 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.007270 |
| 2 | SFG_PFC_L | 12783 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009247 |
| 3 | SFG_PFC_R | 11507 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.008324 |
| 4 | SFG_pole_L | 3078 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.002227 |
# Create seperate tables that group by icv&level1, level1&level2, level2&level3, level3&level4
dat_icvl1 = subjectData.drop(['roi','volume','level4', 'level3', 'level2'],\
axis = 1)
dat_l1l2 = subjectData.drop(['roi','volume','level4', 'level3', 'icv'],\
axis = 1)
dat_l2l3 = subjectData.drop(['roi','volume','level4', 'level1','icv'],\
axis = 1)
dat_icvl1.head()
dat_l1l2.head()
dat_l2l3.head()
| level3 | level2 | comp | |
|---|---|---|---|
| 0 | Frontal_L | CerebralCortex_L | 0.009350 |
| 1 | Frontal_R | CerebralCortex_R | 0.007270 |
| 2 | Frontal_L | CerebralCortex_L | 0.009247 |
| 3 | Frontal_R | CerebralCortex_R | 0.008324 |
| 4 | Frontal_L | CerebralCortex_L | 0.002227 |
# create Sankey diagram
t_list = [('icv','level1'),('level1','level2'),('level2','level3')]
def df_sankey(df, cols_tuple_list):
s = pd.DataFrame([])
for t in cols_tuple_list:
s1 = df.groupby(by=[t[0],t[1]],axis=0).count()
s1 = s1.iloc[:,[0]]
s1.columns = ['value']
if s.shape[0]== 0:
s = s1
else:
s = pd.concat([s,s1],axis=0)
s.reset_index(inplace=True)
s.columns = ['source','target','value']
label_set = set(s['source'].unique()) | set(s['target'].unique())
labels = {v: k for k, v in enumerate(label_set)}
s.replace(labels, inplace=True)
return s,list(label_set)
s,labels = df_sankey(subjectData[['icv','level1','level2','level3']],t_list)
fig = go.Figure(data=[go.Sankey(
node = dict(
pad = 6,
thickness = 20,
line = dict(color = "black", width = 1),
label = labels,
),
link = dict(
source = s['source'].values,
target = s['target'].values,
value = s['value'].values
))])
fig.add_annotation(x=0, y=0.2, text="ICV", showarrow=False, yshift=-20, font=dict(size=12, color="black"))
fig.add_annotation(x=0.34, y=0.2, text="Level1", showarrow=False, yshift=-20, font=dict(size=12, color="black"))
fig.add_annotation(x=0.7, y=0.1, text="Level2", showarrow=False, yshift=-20, font=dict(size=12, color="black"))
fig.add_annotation(x=1, y=0.05, text="Level3", showarrow=False, yshift=-20, font=dict(size=12, color="black"))
fig.update_layout(title_text="Sankey", font_size=10, width=1000, height=1000)
fig.show()
#fig.write_html("/Users/mujin/Library/CloudStorage/OneDrive-JohnsHopkins/academic file/ScM/Term 7/ds/Assignment 4/q2.html")
This is the interactive plot in my github public repository
https://github.com/mjin19/ds4ph/blob/main/q2.html
As this could be a little too large to open directly, this link could help open it in a web browser
https://htmlpreview.github.io/?https://github.com/mjin19/ds4ph/blob/main/q2.html
Here I showed how I created a database and read three data files in the database using sqlite:
(base) mujin@Mus-MacBook-Pro ~ % wget https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/county_pop_arcos.csv
wget https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/land_area.csv
wget https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/county_annual.csv
--2023-02-18 23:51:04-- https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/county_pop_arcos.csv
(base) mujin@Mus-MacBook-Pro ~ % sqlite3 opioid.db
SQLite version 3.39.3 2022-09-05 11:02:23
sqlite> .mode csv
sqlite> .import county_pop_arcos.csv population
sqlite> .import county_annual.csv annual
sqlite> .import land_area.csv land
sqlite> .tables
annual county_pop_arcos land_area
county_annual land population
# Do the data wrangling in Python
## create a connection
conn = sqlite3.connect('/Users/mujin/Library/CloudStorage/OneDrive-JohnsHopkins/academic file/ScM/Term 7/ds/Assignment 4/opioid.db')
## read tables into python
population = pd.read_sql("SELECT * FROM population", conn)
annual = pd.read_sql("SELECT * FROM annual", conn)
land = pd.read_sql("SELECT * FROM land", conn)
# check head rows
population.head(10)
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | countyfips | STATE | COUNTY | county_name | NAME | variable | year | population | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2006 | 51328 |
| 1 | 2 | BALDWIN | AL | 1003 | 1 | 3 | Baldwin | Baldwin County, Alabama | B01003_001 | 2006 | 168121 |
| 2 | 3 | BARBOUR | AL | 1005 | 1 | 5 | Barbour | Barbour County, Alabama | B01003_001 | 2006 | 27861 |
| 3 | 4 | BIBB | AL | 1007 | 1 | 7 | Bibb | Bibb County, Alabama | B01003_001 | 2006 | 22099 |
| 4 | 5 | BLOUNT | AL | 1009 | 1 | 9 | Blount | Blount County, Alabama | B01003_001 | 2006 | 55485 |
| 5 | 6 | BULLOCK | AL | 1011 | 1 | 11 | Bullock | Bullock County, Alabama | B01003_001 | 2006 | 10776 |
| 6 | 7 | BUTLER | AL | 1013 | 1 | 13 | Butler | Butler County, Alabama | B01003_001 | 2006 | 20815 |
| 7 | 8 | CALHOUN | AL | 1015 | 1 | 15 | Calhoun | Calhoun County, Alabama | B01003_001 | 2006 | 115388 |
| 8 | 9 | CHAMBERS | AL | 1017 | 1 | 17 | Chambers | Chambers County, Alabama | B01003_001 | 2006 | 34945 |
| 9 | 10 | CHEROKEE | AL | 1019 | 1 | 19 | Cherokee | Cherokee County, Alabama | B01003_001 | 2006 | 25466 |
annual.head(10)
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | ABBEVILLE | SC | 2006 | 877 | 363620.0 | 45001.0 |
| 1 | 2 | ABBEVILLE | SC | 2007 | 908 | 402940.0 | 45001.0 |
| 2 | 3 | ABBEVILLE | SC | 2008 | 871 | 424590.0 | 45001.0 |
| 3 | 4 | ABBEVILLE | SC | 2009 | 930 | 467230.0 | 45001.0 |
| 4 | 5 | ABBEVILLE | SC | 2010 | 1197 | 539280.0 | 45001.0 |
| 5 | 6 | ABBEVILLE | SC | 2011 | 1327 | 566560.0 | 45001.0 |
| 6 | 7 | ABBEVILLE | SC | 2012 | 1509 | 589010.0 | 45001.0 |
| 7 | 8 | ABBEVILLE | SC | 2013 | 1572 | 596420.0 | 45001.0 |
| 8 | 9 | ABBEVILLE | SC | 2014 | 1558 | 641350.0 | 45001.0 |
| 9 | 10 | ACADIA | LA | 2006 | 5802 | 1969720.0 | 22001.0 |
land.head(10)
| Unnamed: 0 | Areaname | STCOU | LND010190F | LND010190D | LND010190N1 | LND010190N2 | LND010200F | LND010200D | LND010200N1 | ... | LND110210N1 | LND110210N2 | LND210190F | LND210190D | LND210190N1 | LND210190N2 | LND210200F | LND210200D | LND210200N1 | LND210200N2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | UNITED STATES | 0 | 0 | 3787425.08 | 0 | 0 | 0 | 3794083.06 | 0 | ... | 0 | 0 | 0 | 251083.35 | 0 | 0 | 0 | 256644.62 | 0 | 0 |
| 1 | 2 | ALABAMA | 1000 | 0 | 52422.94 | 0 | 0 | 0 | 52419.02 | 0 | ... | 0 | 0 | 0 | 1672.71 | 0 | 0 | 0 | 1675.01 | 0 | 0 |
| 2 | 3 | Autauga, AL | 1001 | 0 | 604.49 | 0 | 0 | 0 | 604.45 | 0 | ... | 0 | 0 | 0 | 8.48 | 0 | 0 | 0 | 8.48 | 0 | 0 |
| 3 | 4 | Baldwin, AL | 1003 | 0 | 2027.08 | 0 | 0 | 0 | 2026.93 | 0 | ... | 0 | 0 | 0 | 430.55 | 0 | 0 | 0 | 430.58 | 0 | 0 |
| 4 | 5 | Barbour, AL | 1005 | 0 | 904.59 | 0 | 0 | 0 | 904.52 | 0 | ... | 0 | 0 | 0 | 19.59 | 0 | 0 | 0 | 19.61 | 0 | 0 |
| 5 | 6 | Bibb, AL | 1007 | 0 | 625.50 | 0 | 0 | 0 | 626.16 | 0 | ... | 0 | 0 | 0 | 3.14 | 0 | 0 | 0 | 3.14 | 0 | 0 |
| 6 | 7 | Blount, AL | 1009 | 0 | 650.65 | 0 | 0 | 0 | 650.60 | 0 | ... | 0 | 0 | 0 | 4.97 | 0 | 0 | 0 | 5.02 | 0 | 0 |
| 7 | 8 | Bullock, AL | 1011 | 0 | 626.11 | 0 | 0 | 0 | 626.06 | 0 | ... | 0 | 0 | 0 | 1.04 | 0 | 0 | 0 | 1.04 | 0 | 0 |
| 8 | 9 | Butler, AL | 1013 | 0 | 777.99 | 0 | 0 | 0 | 777.92 | 0 | ... | 0 | 0 | 0 | 1.05 | 0 | 0 | 0 | 1.05 | 0 | 0 |
| 9 | 10 | Calhoun, AL | 1015 | 0 | 612.35 | 0 | 0 | 0 | 612.32 | 0 | ... | 0 | 0 | 0 | 3.85 | 0 | 0 | 0 | 3.86 | 0 | 0 |
10 rows × 35 columns
# print out some of the missing data in the annual dataset
annual[annual['countyfips'].isna()].head(10)
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|---|
| 187 | 188 | ADJUNTAS | PR | 2006 | 147 | 102800.0 | NaN |
| 188 | 189 | ADJUNTAS | PR | 2007 | 153 | 104800.0 | NaN |
| 189 | 190 | ADJUNTAS | PR | 2008 | 153 | 45400.0 | NaN |
| 190 | 191 | ADJUNTAS | PR | 2009 | 184 | 54200.0 | NaN |
| 191 | 192 | ADJUNTAS | PR | 2010 | 190 | 56200.0 | NaN |
| 192 | 193 | ADJUNTAS | PR | 2011 | 186 | 65530.0 | NaN |
| 193 | 194 | ADJUNTAS | PR | 2012 | 138 | 57330.0 | NaN |
| 194 | 195 | ADJUNTAS | PR | 2013 | 138 | 65820.0 | NaN |
| 195 | 196 | ADJUNTAS | PR | 2014 | 90 | 59490.0 | NaN |
| 196 | 197 | AGUADA | PR | 2006 | 160 | 49200.0 | NaN |
# Place other than Puerto Rico (PR)
annual[(annual['countyfips'].isna()) & (annual['BUYER_STATE'] != 'PR')].head(10)
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|---|
| 10071 | 10072 | GUAM | GU | 2006 | 319 | 265348.0 | NaN |
| 10072 | 10073 | GUAM | GU | 2007 | 330 | 275600.0 | NaN |
| 10073 | 10074 | GUAM | GU | 2008 | 313 | 286900.0 | NaN |
| 10074 | 10075 | GUAM | GU | 2009 | 390 | 355300.0 | NaN |
| 10075 | 10076 | GUAM | GU | 2010 | 510 | 413800.0 | NaN |
| 10076 | 10077 | GUAM | GU | 2011 | 559 | 475600.0 | NaN |
| 10077 | 10078 | GUAM | GU | 2012 | 616 | 564800.0 | NaN |
| 10078 | 10079 | GUAM | GU | 2013 | 728 | 623200.0 | NaN |
| 10079 | 10080 | GUAM | GU | 2014 | 712 | 558960.0 | NaN |
| 17429 | 17430 | MONTGOMERY | AR | 2006 | 469 | 175390.0 | NaN |
# update countyfips of Montgomery
annual.loc[(annual['BUYER_STATE'] == 'AR') & (annual['BUYER_COUNTY'] == 'MONTGOMERY'), 'countyfips'] = '05097'
annual[(annual['countyfips'].isna()) & (annual['BUYER_STATE'] != 'PR')].head(10)
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|---|
| 10071 | 10072 | GUAM | GU | 2006 | 319 | 265348.0 | NaN |
| 10072 | 10073 | GUAM | GU | 2007 | 330 | 275600.0 | NaN |
| 10073 | 10074 | GUAM | GU | 2008 | 313 | 286900.0 | NaN |
| 10074 | 10075 | GUAM | GU | 2009 | 390 | 355300.0 | NaN |
| 10075 | 10076 | GUAM | GU | 2010 | 510 | 413800.0 | NaN |
| 10076 | 10077 | GUAM | GU | 2011 | 559 | 475600.0 | NaN |
| 10077 | 10078 | GUAM | GU | 2012 | 616 | 564800.0 | NaN |
| 10078 | 10079 | GUAM | GU | 2013 | 728 | 623200.0 | NaN |
| 10079 | 10080 | GUAM | GU | 2014 | 712 | 558960.0 | NaN |
| 18501 | 18502 | NORTHERN MARIANA ISLANDS | MP | 2006 | 165 | 117850.0 | NaN |
# check for missing data
annual[annual['BUYER_COUNTY'].isna()]
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|---|
| 27741 | 27742 | None | AE | 2006 | 2 | 330.0 | NaN |
| 27742 | 27743 | None | CA | 2006 | 47 | 12600.0 | NaN |
| 27743 | 27744 | None | CT | 2006 | 305 | 78700.0 | NaN |
| 27744 | 27745 | None | CT | 2007 | 112 | 30900.0 | NaN |
| 27745 | 27746 | None | CT | 2008 | 48 | 15000.0 | NaN |
| 27746 | 27747 | None | FL | 2006 | 9 | 900.0 | NaN |
| 27747 | 27748 | None | FL | 2007 | 7 | 700.0 | NaN |
| 27748 | 27749 | None | GA | 2006 | 114 | 51700.0 | NaN |
| 27749 | 27750 | None | IA | 2006 | 7 | 2300.0 | NaN |
| 27750 | 27751 | None | IN | 2006 | 292 | 39300.0 | NaN |
| 27751 | 27752 | None | MA | 2006 | 247 | 114900.0 | NaN |
| 27752 | 27753 | None | NV | 2006 | 380 | 173600.0 | NaN |
| 27753 | 27754 | None | NV | 2007 | 447 | 200600.0 | NaN |
| 27754 | 27755 | None | NV | 2008 | 5 | 2200.0 | NaN |
| 27755 | 27756 | None | OH | 2006 | 23 | 5100.0 | NaN |
| 27756 | 27757 | None | PR | 2006 | 10 | 17800.0 | NaN |
| 27757 | 27758 | None | PR | 2007 | 2 | 1300.0 | NaN |
# drop records with missing county data
annual = annual.dropna(subset=['BUYER_COUNTY'])
# check for missing data again
annual[annual['BUYER_COUNTY'].isna()]
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips |
|---|
# select three needed columns
land_area = land[['Areaname', 'STCOU', 'LND110210D']].copy()
# check the head rows
land_area.head(10)
| Areaname | STCOU | LND110210D | |
|---|---|---|---|
| 0 | UNITED STATES | 0 | 3531905.43 |
| 1 | ALABAMA | 1000 | 50645.33 |
| 2 | Autauga, AL | 1001 | 594.44 |
| 3 | Baldwin, AL | 1003 | 1589.78 |
| 4 | Barbour, AL | 1005 | 884.88 |
| 5 | Bibb, AL | 1007 | 622.58 |
| 6 | Blount, AL | 1009 | 644.78 |
| 7 | Bullock, AL | 1011 | 622.81 |
| 8 | Butler, AL | 1013 | 776.83 |
| 9 | Calhoun, AL | 1015 | 605.87 |
# rename the STCOU column to countyfips
land_area = land_area.rename(columns={'STCOU': 'countyfips'})
# check the head rows
land_area.head(10)
| Areaname | countyfips | LND110210D | |
|---|---|---|---|
| 0 | UNITED STATES | 0 | 3531905.43 |
| 1 | ALABAMA | 1000 | 50645.33 |
| 2 | Autauga, AL | 1001 | 594.44 |
| 3 | Baldwin, AL | 1003 | 1589.78 |
| 4 | Barbour, AL | 1005 | 884.88 |
| 5 | Bibb, AL | 1007 | 622.58 |
| 6 | Blount, AL | 1009 | 644.78 |
| 7 | Bullock, AL | 1011 | 622.81 |
| 8 | Butler, AL | 1013 | 776.83 |
| 9 | Calhoun, AL | 1015 | 605.87 |
# Combine the two datasets
county_info = pd.merge(population, land_area, on='countyfips', how='left')
county_info.head(10)
| Unnamed: 0 | BUYER_COUNTY | BUYER_STATE | countyfips | STATE | COUNTY | county_name | NAME | variable | year | population | Areaname | LND110210D | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2006 | 51328 | Autauga, AL | 594.44 |
| 1 | 2 | BALDWIN | AL | 1003 | 1 | 3 | Baldwin | Baldwin County, Alabama | B01003_001 | 2006 | 168121 | Baldwin, AL | 1589.78 |
| 2 | 3 | BARBOUR | AL | 1005 | 1 | 5 | Barbour | Barbour County, Alabama | B01003_001 | 2006 | 27861 | Barbour, AL | 884.88 |
| 3 | 4 | BIBB | AL | 1007 | 1 | 7 | Bibb | Bibb County, Alabama | B01003_001 | 2006 | 22099 | Bibb, AL | 622.58 |
| 4 | 5 | BLOUNT | AL | 1009 | 1 | 9 | Blount | Blount County, Alabama | B01003_001 | 2006 | 55485 | Blount, AL | 644.78 |
| 5 | 6 | BULLOCK | AL | 1011 | 1 | 11 | Bullock | Bullock County, Alabama | B01003_001 | 2006 | 10776 | Bullock, AL | 622.81 |
| 6 | 7 | BUTLER | AL | 1013 | 1 | 13 | Butler | Butler County, Alabama | B01003_001 | 2006 | 20815 | Butler, AL | 776.83 |
| 7 | 8 | CALHOUN | AL | 1015 | 1 | 15 | Calhoun | Calhoun County, Alabama | B01003_001 | 2006 | 115388 | Calhoun, AL | 605.87 |
| 8 | 9 | CHAMBERS | AL | 1017 | 1 | 17 | Chambers | Chambers County, Alabama | B01003_001 | 2006 | 34945 | Chambers, AL | 596.53 |
| 9 | 10 | CHEROKEE | AL | 1019 | 1 | 19 | Cherokee | Cherokee County, Alabama | B01003_001 | 2006 | 25466 | Cherokee, AL | 553.70 |
# check results
print(len(land))
print(len(land_area))
print(len(population))
print(len(county_info))
3198 3198 28265 28265
fig = px.scatter(annual, x='year', y='count',color='BUYER_STATE', labels={'BUYER_STATE': 'State'}, render_mode='svg')
fig.update_layout(height=800, width=1000)
fig.show()
#fig.write_html("/Users/mujin/Library/CloudStorage/OneDrive-JohnsHopkins/academic file/ScM/Term 7/ds/Assignment 4/q3.html")
This is the interactive plot in my github public repository
https://github.com/mjin19/ds4ph/blob/main/q4.html
As this could be a little too large to open directly, this link could help open it in a web browser
https://htmlpreview.github.io/?https://github.com/mjin19/ds4ph/blob/main/q4.html